#Import all the necessary libraries
import pandas as pd # Used for data wrangling
import plotly.express as px
import matplotlib.pyplot as plt # Used for data visualization
from statsmodels.tsa.ar_model import AutoReg # The model to predict timeseries
from sklearn.metrics import mean_absolute_error, r2_score # Evaluation function
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf #Auto correlation visualization.
import plotly.io as pio
pio.renderers.default='notebook' # Used to render plotly viz to stay on jupyter.
import warnings
warnings.filterwarnings('ignore') #Filter warnings
# Load the hourly temperature readings (values are in Kelvin, per describe() below).
temperature_df = pd.read_csv("data/temperature.csv")
# Keep only the timestamp column and the city we are analysing (New York).
# .copy() avoids a SettingWithCopyWarning when we mutate this slice below.
df = temperature_df[["datetime", "New York"]].copy()
# Rename the city column to a generic "Temperature" label
df = df.rename(columns = {"New York": "Temperature"})
# Parse the timestamps into proper datetime objects
df["datetime"] = pd.to_datetime(df["datetime"])
# Index by timestamp, then squeeze the single remaining column into a Series
df = df.set_index("datetime").squeeze()
# The raw timestamps are UTC; convert them to New York local time
df.index = df.index.tz_localize("UTC").tz_convert('America/New_York')
# Resample to a regular 1-hour frequency (mean of readings in each hour)
df = df.resample("H").mean()
df.head()
datetime 2012-10-01 08:00:00-04:00 NaN 2012-10-01 09:00:00-04:00 288.220000 2012-10-01 10:00:00-04:00 288.247676 2012-10-01 11:00:00-04:00 288.326940 2012-10-01 12:00:00-04:00 288.406203 Freq: H, Name: Temperature, dtype: float64
# How many hourly readings are missing after resampling?
df.isnull().sum()
793
There are 793 hourly records with missing temperature values.
df.describe()
count 44460.000000 mean 285.400406 std 10.220932 min 250.774000 25% 277.370000 50% 285.870000 75% 293.760000 max 310.240000 Name: Temperature, dtype: float64
We can see from the above summary that the minimum temperature is approximately 250 and the maximum is approximately 310 — the readings are therefore in Kelvin.
# Histogram of the hourly temperature readings.
hist_fig = px.histogram(
    df, nbins=50, title="Distribution of the Temperature Readings"
)
hist_fig.show()
We can see from the above histogram that the temperature of New York is approximately normally distributed over the years.
# Plot the raw hourly series to inspect trend and seasonality.
hourly_fig = px.line(
    df, title="Readings of the Temperature in the Window of 1HR over the Years."
)
hourly_fig.show()
There is clear seasonality in the New York temperature: the hourly readings over the years follow a sine-wave-like pattern.
# Downsample to daily means for a less noisy view of the series.
daily = df.resample("D").mean()
daily_fig = px.line(
    daily,
    hover_name=daily.index,
    title="Readings of the Temperature in the Window of 24HR over the Years.",
)
daily_fig.show()
We can see from the above plot that the day with the lowest average temperature was the 20th of February 2015, with a value of 258.5913 K, approximately -14.56 degrees Celsius.
We can also see that the day with the highest average temperature was the 20th of July 2013, with a Kelvin value of 304.77, equivalent to 31.62 degrees Celsius, which makes it the third hottest temperature in the history of New York.
# Monthly means highlight the annual temperature cycle.
monthly = df.resample("M").mean()
monthly_fig = px.line(
    monthly,
    title="Readings of the Temperature in the Window of 1 Month over the Years.",
)
monthly_fig.show()
We can see from the above plot that February 2015 was the month with the lowest average temperature over the years.
Also, July 2013 was the month with the highest average temperature over the years.
# Quarterly means smooth the series further.
fig = px.line(
    df.resample("Q").mean(),
    # Fixed typo in the displayed title: "Quater" -> "Quarter".
    title="Readings of the Temperature in the Window of Quarter over the Years.",
)
fig.show()
The first quarter of 2015 was the quarter with the deepest temperature dip over the years in New York.
df.resample("A").mean()
datetime 2012-12-31 00:00:00-05:00 281.453499 2013-12-31 00:00:00-05:00 285.342590 2014-12-31 00:00:00-05:00 284.515356 2015-12-31 00:00:00-05:00 284.949812 2016-12-31 00:00:00-05:00 286.055159 2017-12-31 00:00:00-05:00 287.504149 Freq: A-DEC, Name: Temperature, dtype: float64
# Visualize the annual mean temperatures computed above.
annual = df.resample("A").mean()
annual_fig = px.line(
    annual, title="Readings of the Temperature at the End of Each Year."
)
annual_fig.show()
We can see an upward trend in the temperature values over the years. This could be a result of global warming.
# Impute missing hours: back-fill first, then forward-fill any gap left at
# the end of the series. fillna(method=...) is deprecated since pandas 2.1
# (removed in 3.0), so use the dedicated bfill()/ffill() methods instead.
df = df.bfill()
df = df.ffill()
df.head()
datetime 2012-10-01 08:00:00-04:00 288.220000 2012-10-01 09:00:00-04:00 288.220000 2012-10-01 10:00:00-04:00 288.247676 2012-10-01 11:00:00-04:00 288.326940 2012-10-01 12:00:00-04:00 288.406203 Freq: H, Name: Temperature, dtype: float64
# Autocorrelation of the hourly series, to guide the AR lag-order choice.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(df, ax=ax)
plt.xlabel("Lag [hours]")  # fixed stray backtick in the axis label
plt.ylabel("Correlation Coefficient");
# Partial autocorrelation of the series ("ywm" = Yule-Walker method).
pacf_fig, pacf_ax = plt.subplots(figsize=(15, 6))
plot_pacf(df, ax=pacf_ax, method="ywm")
plt.xlabel("Lag [hours]")
plt.ylabel("Correlation Coefficient");
# Chronological 90/10 train/test split (no shuffling for time-series data).
cutoff_test = int(len(df) * 0.9)  # Using 90% of the data for training
y_train = df.iloc[:cutoff_test]
y_test = df.iloc[cutoff_test:]
# Naive baseline: always predict the training-set mean.
y_train_mean = y_train.mean()
y_pred_baseline = [y_train_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)
# Fixed label: this is a temperature reading, not a PM2.5 ("P2") reading.
print("Mean Temperature Reading:", round(y_train_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))
Mean P2 Reading: 284.59 Baseline MAE: 8.57
We used the mean of the entire training set as the prediction and then computed the metric, which came out to 8.57. Therefore we need to build a model whose error is lower than that.
# Search the AR lag order p over 1..44, scored by in-sample MAE.
p_params = range(1, 45)
maes = []
for lag_order in p_params:
    fitted = AutoReg(y_train, lags=lag_order).fit()
    # In-sample predictions; the first `lag_order` points are NaN, so drop
    # them and align the targets accordingly.
    in_sample = fitted.predict().dropna()
    maes.append(mean_absolute_error(y_train[lag_order:], in_sample))
mae_series = pd.Series(maes, name="mae", index=p_params)
mae_series.sort_values().head()
43 0.539295 44 0.539317 42 0.539919 41 0.540170 40 0.540445 Name: mae, dtype: float64
We can see that a lag order of 43 helps our model predict the time series most accurately, as its mean absolute error was the lowest.
# Walk-forward validation: refit on all available history, forecast one
# step ahead, then append the *observed* value before the next forecast.
# NOTE: pd.Series.append was removed in pandas 2.0 — use pd.concat instead;
# also give the empty accumulator an explicit dtype to avoid a warning.
y_pred_wfv = pd.Series(dtype="float64")
history = y_train.copy()
for i in range(len(y_test)):
    model_wfv = AutoReg(history, lags=43).fit()
    next_pred = model_wfv.forecast()  # one-step-ahead forecast
    y_pred_wfv = pd.concat([y_pred_wfv, next_pred])
    history = pd.concat([history, y_test[next_pred.index]])
y_pred_wfv.head()
2017-05-25 07:00:00-04:00 286.339709 2017-05-25 08:00:00-04:00 286.645965 2017-05-25 09:00:00-04:00 286.813751 2017-05-25 10:00:00-04:00 287.104280 2017-05-25 11:00:00-04:00 287.217840 Freq: H, dtype: float64
# Plot the walk-forward predictions against the held-out test set.
result = {"y_test": y_test, "y_pred_wfv": y_pred_wfv}
df_pred_test = pd.DataFrame(result)
fig = px.line(df_pred_test)
fig.update_layout(
    title="Weather Forecasting, WFV Predictions",
    xaxis_title="Date",
    # Fixed label: the series is temperature in Kelvin, not PM2.5.
    yaxis_title="Temperature [K]",
)
fig.show()  # the figure was never displayed before; show it explicitly
# Test-set MAE of the walk-forward forecasts.
mean_absolute_error(y_test, y_pred_wfv)
0.36939025645673274
r2_score(y_test, y_pred_wfv)
0.9907992326314755
We can see that our model performed well, with an R² score of approximately 0.99 on the test set.